In [1]:
import pandas as pd
import numpy as np 
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import matplotlib.pyplot as plt
import plotly.plotly as py
from plotly import tools
from datetime import date
import seaborn as sns
import random 
import warnings
warnings.filterwarnings("ignore")
init_notebook_mode(connected=True)
import zipfile
import os
import winsound
In [2]:
import sys
In [3]:
sys.path.append('c:\\users\\chait\\anaconda3\\lib\\site-packages')
In [4]:
"""Helper Functions to do Plotting"""
def generateLayoutBar(col_name):
    """
    Generate a layout object for bar chart
    """
    layout_bar = go.Layout(
        autosize=False,  # auto size the graph? use False if you are specifying the height and width
        width=800,  # height of the figure in pixels
        height=600,  # height of the figure in pixels
        title="Distribution of {} column".format(col_name),  # title of the figure
        # more granular control on the title font
        titlefont=dict(
            family='Courier New, monospace',  # font family
            size=14,  # size of the font
            color='black'  # color of the font
        ),
        # granular control on the axes objects
        xaxis=dict(
            tickfont=dict(
                family='Courier New, monospace',  # font family
                size=14,  # size of ticks displayed on the x axis
                color='black'  # color of the font
            )
        ),
        yaxis=dict(
            #         range=[0,100],
            title='Percentage',
            titlefont=dict(
                size=14,
                color='black'
            ),
            tickfont=dict(
                family='Courier New, monospace',  # font family
                size=14,  # size of ticks displayed on the y axis
                color='black'  # color of the font
            )
        ),
        font=dict(
            family='Courier New, monospace',  # font family
            color="white",  # color of the font
            size=12  # size of the font displayed on the bar
        )
    )
    return layout_bar


def plotBar(dataframe_name, col_name, top_n=None):
    """
    Plot a bar chart for the categorical columns

    Arguments:
    dataframe name
    categorical column name

    Output:
    Plot
    """
    # create a table with value counts
    temp = dataframe_name[col_name].value_counts()
    if top_n is not None:
        temp = temp.head(top_n)
    # creating a Bar chart object of plotly
    data = [go.Bar(
            x=temp.index.astype(str),  # x axis values
            y=np.round(temp.values.astype(float) / temp.values.sum(), 4) * 100,  # y axis values
            text=['{}%'.format(i) for i in np.round(temp.values.astype(float) / temp.values.sum(), 4) * 100],
            # text to be displayed on the bar, we are doing this to display the '%' symbol along with the number on the bar
            textposition='auto',  # specify at which position on the bar the text should appear
            marker=dict(color='#0047AB'),)]  # change color of the bar
    # color used here Cobalt Blue

    layout_bar = generateLayoutBar(col_name=col_name)

    fig = go.Figure(data=data, layout=layout_bar)
    return iplot(fig)
In [5]:
outpath = "../data/"
In [6]:
os.listdir(outpath)
Out[6]:
['20ba1792-d-Data.zip', 'Data']
In [7]:
# for i in os.listdir(outpath):
#     z = zipfile.ZipFile(outpath + i)
#     z.extractall(path = outpath)
#     z.close()
In [8]:
for i in os.listdir(outpath):
    print(i)
20ba1792-d-Data.zip
Data
In [9]:
datapath = '../data/Data/'
In [10]:
for i in os.listdir(datapath):
    print(i)
Customer_Demographics.xlsx
Customer_Transaction.xlsx
RFM.csv
Store_Master.xlsx
Test_Set.xlsx
In [11]:
Customer_Demographics = pd.read_excel(datapath + 'Customer_Demographics.xlsx')
In [12]:
Customer_Transaction = pd.read_excel(datapath + 'Customer_Transaction.xlsx')
In [13]:
Store_Master = pd.read_excel(datapath + 'Store_Master.xlsx')
In [14]:
Test_Set = pd.read_excel(datapath + 'Test_Set.xlsx')
In [15]:
[i.shape for i in [Customer_Demographics, Customer_Transaction, Store_Master, Test_Set]]
Out[15]:
[(100000, 16), (544649, 15), (35, 15), (400000, 2)]

No information about the units given in the data. Assuming Revenue in dhirams.

Customer_Demographics

In [16]:
Customer_Demographics.head()
Out[16]:
Customer_ID Territory Nationality Income_Range Job_Type Marital_Status Gender State Language Loyalty_Status Birth_date Age Points First_txn_dt Last_accr_txn_dt Last_rdm_txn_dt
0 1800000006365760 United Arab Emirates INDIA Below 5000 Services Married M Abu Dhabi English Gold 01JUN1959:00:00:00 59.0 814.0 02NOV2010:00:00:00 27SEP2018:21:45:25 11MAR2018:19:11:57
1 1800000006365820 United Arab Emirates UNITED ARAB EMIRATES Below 5000 Services Married F Abu Dhabi Arabic Silver 01JAN1900:00:00:00 NaN 212.0 06DEC2010:00:00:00 16AUG2016:21:08:28 16MAR2015:13:39:12
2 1800000006366060 United Arab Emirates SRI LANKA Below 5000 Services Married M Abu Dhabi English Silver 07JUN1959:00:00:00 59.0 186.0 17FEB2011:00:00:00 31AUG2018:20:52:18 30JAN2017:19:19:19
3 1800000006366230 United Arab Emirates JORDAN Below 5000 Services Married F Abu Dhabi Arabic Silver 21JAN1962:00:00:00 57.0 126.0 30OCT2010:00:00:00 30JUL2018:20:30:05 20JUN2017:17:57:27
4 1800000006366570 United Arab Emirates OCCUPIED PALESTINIAN TERRITORY Below 5000 Services Married M Abu Dhabi Arabic Silver 01JAN1900:00:00:00 NaN 197.0 10NOV2010:00:00:00 03SEP2018:22:10:56 17JUN2017:23:08:28
In [17]:
Customer_Demographics.dtypes
Out[17]:
Customer_ID           int64
Territory            object
Nationality          object
Income_Range         object
Job_Type             object
Marital_Status       object
Gender               object
State                object
Language             object
Loyalty_Status       object
Birth_date           object
Age                 float64
Points              float64
First_txn_dt         object
Last_accr_txn_dt     object
Last_rdm_txn_dt      object
dtype: object
In [18]:
from datetime import datetime
In [19]:
datetime.strptime('Jun 1 2005  1:33PM', '%b %d %Y %I:%M%p')
Out[19]:
datetime.datetime(2005, 6, 1, 13, 33)
In [20]:
Customer_Demographics.First_txn_dt[0]
Out[20]:
'02NOV2010:00:00:00'
In [21]:
t = datetime.strptime('02NOV2010:00:00:00', '%d%b%Y:%H:%M:%S')
In [22]:
date_cols_demo = [ i for i in Customer_Demographics.columns if (i.endswith('dt')) or ('date' in i)]
In [23]:
date_cols_demo
Out[23]:
['Birth_date', 'First_txn_dt', 'Last_accr_txn_dt', 'Last_rdm_txn_dt']
In [24]:
Customer_Demographics.First_txn_dt.dtype
Out[24]:
dtype('O')
In [25]:
Customer_Demographics[date_cols_demo] = Customer_Demographics[date_cols_demo].astype('O')
In [26]:
def convertToDate(x):
    try:
        return datetime.strptime(x, '%d%b%Y:%H:%M:%S')
    except:
        return x
In [27]:
Customer_Demographics[date_cols_demo] = Customer_Demographics[date_cols_demo].applymap(lambda x : convertToDate(x))
Feature engineering
In [28]:
date_cols_demo
Out[28]:
['Birth_date', 'First_txn_dt', 'Last_accr_txn_dt', 'Last_rdm_txn_dt']
In [29]:
def extractColTypes(dataset):
    """This functions extracts numeric, categorical , datetime and boolean column types.
    Returns 4 lists with respective column types"""
    num_cols_list = [i for i in dataset.columns if dataset[i].dtype in ['int64','float64']]
    cat_cols_list = [i for i in dataset.columns if dataset[i].dtype in ['object']]
    date_cols_list = [i for i in dataset.columns if dataset[i].dtype in ['datetime64[ns]']]
    bool_cols_list = [i for i in dataset.columns if dataset[i].dtype in ['bool']]
    print ("Numeric Columns:", len(num_cols_list))
    print ("Categorical/Character Columns:", len(cat_cols_list))
    print ("Date Columns:",len(date_cols_list))
    print ("Boolean Columns:",len(bool_cols_list))
    return(num_cols_list,cat_cols_list,date_cols_list,bool_cols_list)
In [30]:
demo_num_cols_list,demo_cat_cols_list,demo_date_cols_list,demo_bool_cols_list = extractColTypes(Customer_Demographics)
Numeric Columns: 3
Categorical/Character Columns: 9
Date Columns: 4
Boolean Columns: 0
In [31]:
demo_cat_cols_list
Out[31]:
['Territory',
 'Nationality',
 'Income_Range',
 'Job_Type',
 'Marital_Status',
 'Gender',
 'State',
 'Language',
 'Loyalty_Status']
Days since first transaction
In [32]:
today = convertToDate('24NOV2018:00:00:00')
In [33]:
def getDaysDiff(x):
    t = -1*(x - today)
    return t.apply(lambda x : x.days)
In [34]:
Customer_Demographics['custSince'] = getDaysDiff(Customer_Demographics.First_txn_dt)
days since last accr and rdm transactions
In [35]:
Customer_Demographics['ds_accr']= getDaysDiff(Customer_Demographics.Last_accr_txn_dt)
In [36]:
Customer_Demographics['ds_rdm']= getDaysDiff(Customer_Demographics.Last_rdm_txn_dt)

Distribution of the categorical columns

In [37]:
for i in demo_cat_cols_list[1:]:
    plotBar(Customer_Demographics, i,top_n=10)

Some columns here like the Income and Marital status are unspecified, as of now we cannot decide if they will help in the prediction, but we should keep a close watch for these.

In [38]:
demo_num_cols_list
Out[38]:
['Customer_ID', 'Age', 'Points']
In [39]:
def plotHist(dataframe, col_name):
    data = [go.Histogram(x=dataframe[col_name],
       marker=dict(
        color='#CC0E1D',# Lava (#CC0E1D)
#         color = 'rgb(200,0,0)'   # you can provide color in HEX format or rgb format, genrally programmers prefer HEX format as it is a single string value and easy to pass as a variable
    ))]
    layout = go.Layout(title = "Histogram of {}".format(col_name))
    fig = go.Figure(data= data, layout=layout)
    return iplot(fig)
In [40]:
plotHist(Customer_Demographics, 'Age')

Looks like the age group of 30-40 are frequent visitors of the store, also we see some entries above 100, I will remove them

In [41]:
Customer_Demographics.Age = Customer_Demographics.Age.clip_upper(100)
In [42]:
plotHist(Customer_Demographics, 'Age')
In [43]:
plotHist(Customer_Demographics, 'Points')

Most people have 0-2000 points

In [44]:
Customer_Demographics[Customer_Demographics.Points>2000].shape
Out[44]:
(1049, 19)

Just above 1000 customers have more than 2000 points

Let us see if these customers are distinguishable from the rest

In [45]:
for i in demo_cat_cols_list[1:]:
    plotBar(Customer_Demographics[Customer_Demographics.Points>2000], i,top_n=10)

The distribution of all the above columns look very similar to those of the unfiltered data. There is no disctinguishing factor as such.

Does loyalty status have correlation with the number of days a customer was with the brand

In [46]:
def plotMultiBox(dataframe,col_name, num_col_name):
    data = []
    for i in dataframe[col_name].unique():
        trace = go.Box(y = dataframe[num_col_name][dataframe[col_name] == i],
                      name = i)
        data.append(trace)
    layout = go.Layout(title="Boxplot of levels in {} for {} column".format(col_name,num_col_name))
    fig = go.Figure(data=data, layout=layout)
    return (iplot(fig))
In [47]:
plotMultiBox(Customer_Demographics, 'Loyalty_Status', 'custSince')

We see that in general, gold customers are with the company longer

In [48]:
plotMultiBox(Customer_Demographics, 'Gender', 'custSince')

Females have been customers for a marginally more number of days than males.

In [49]:
plotMultiBox(Customer_Demographics, 'Income_Range', 'custSince')

Observations:

  • An interesting observation here, customers who earn more than 20,000 Dhirams have been with the company for over 2500 days, also there are no recent entries for such high income groups.
  • The recent entries are from <20000 dhiram earning group. May be the store now has more affordable goods.
  • This could also be a result of the location of the malls

Customer_Transaction

In [50]:
Customer_Transaction.columns
Out[50]:
Index(['Territory', 'Business', 'Year', 'Week', 'Store_Code', 'City_Name',
       'Store_Type', 'Transaction_Type', 'Return_Reason', 'Customer_ID',
       'Invoices', 'Item_Count', 'Revenue', 'Discount', 'Units_Sold'],
      dtype='object')
In [51]:
Customer_Transaction.dtypes
Out[51]:
Territory            object
Business             object
Year                  int64
Week                  int64
Store_Code            int64
City_Name            object
Store_Type           object
Transaction_Type     object
Return_Reason        object
Customer_ID           int64
Invoices              int64
Item_Count            int64
Revenue             float64
Discount            float64
Units_Sold            int64
dtype: object
In [52]:
Customer_Transaction.head(20)
Out[52]:
Territory Business Year Week Store_Code City_Name Store_Type Transaction_Type Return_Reason Customer_ID Invoices Item_Count Revenue Discount Units_Sold
0 United Arab Emirates Max 2010 0 60065 Dubai Stand alone Return Size Problem 1800000058056860 1 3 -169.0 0.0 -3
1 United Arab Emirates Max 2016 0 60053 NaN NaN Purchase NaN 1800000020917140 1 1 0.0 0.0 0
2 United Arab Emirates Max 2016 0 60053 NaN NaN Purchase NaN 1800000021265010 1 1 0.0 0.0 0
3 United Arab Emirates Max 2016 0 60053 NaN NaN Purchase NaN 1800000025693950 1 4 0.0 0.0 0
4 United Arab Emirates Max 2016 0 60053 NaN NaN Purchase NaN 1800000044104620 1 1 0.0 0.0 0
5 United Arab Emirates Max 2016 0 60053 NaN NaN Purchase NaN 1800000045703150 1 1 0.0 0.0 0
6 United Arab Emirates Max 2016 0 60053 NaN NaN Purchase NaN 1800000045740150 1 1 0.0 0.0 0
7 United Arab Emirates Max 2016 0 60053 NaN NaN Purchase NaN 1800000046079170 2 7 0.0 0.0 0
8 United Arab Emirates Max 2016 0 60053 NaN NaN Purchase NaN 1800000047493340 1 1 0.0 0.0 0
9 United Arab Emirates Max 2016 0 60053 NaN NaN Purchase NaN 1800000048130990 1 2 0.0 0.0 0
10 United Arab Emirates Max 2016 0 60053 NaN NaN Purchase NaN 1800000054019360 1 1 0.0 0.0 0
11 United Arab Emirates Max 2016 0 60053 NaN NaN Purchase NaN 1800000054841260 1 1 0.0 0.0 0
12 United Arab Emirates Max 2016 0 60053 NaN NaN Purchase NaN 1800000054994790 1 1 0.0 0.0 0
13 United Arab Emirates Max 2016 0 60053 NaN NaN Purchase NaN 1800000058101370 1 1 0.0 0.0 0
14 United Arab Emirates Max 2016 0 60053 NaN NaN Purchase NaN 1800000058661850 1 1 0.0 0.0 0
15 United Arab Emirates Max 2016 0 60053 NaN NaN Purchase NaN 1800000080131780 1 1 0.0 0.0 0
16 United Arab Emirates Max 2016 0 60053 NaN NaN Purchase NaN 1800000080464400 1 2 0.0 0.0 0
17 United Arab Emirates Max 2016 0 60053 NaN NaN Purchase NaN 1800000089762380 1 1 0.0 0.0 0
18 United Arab Emirates Max 2016 0 60053 NaN NaN Purchase NaN 1800000117190630 1 1 0.0 0.0 0
19 United Arab Emirates Max 2016 0 60053 NaN NaN Purchase NaN 1800000117238230 1 1 0.0 0.0 0
In [53]:
plotHist(Customer_Transaction,'Revenue')